library(ggplot2)
library(ggrepel)
library(ggpubr)
library(stringr)
library(MASS)
library(RColorBrewer)
library(DESeq2)
library(viridis)
library(ggpointdensity)
library(dplyr)
library(data.table)
library(ComplexHeatmap)
library(UpSetR)
library(readxl)
theme_set(theme_classic())
dat_path = "~/research/data/MorPhiC/December-2023/JAX_RNAseq_ExtraEmbryonic/"
dat_path = paste0(dat_path, "processed/Tables")meta_flat = read_excel("data/JAX_RNAseq_ExtraEmbryonic_meta_falttened.xlsx")
meta_flat = as.data.frame(meta_flat)
dim(meta_flat)## [1] 90 37
## Dataset
## 1 JAX; CRISPR-Cas9 KO; Primitive Syncytium, Extra-embryonic mesoderm; Bulk RNA-seq; 10X CRISPR; Illumina
## 2 JAX; CRISPR-Cas9 KO; Primitive Syncytium, Extra-embryonic mesoderm; Bulk RNA-seq; 10X CRISPR; Illumina
## Counts file
## 1 PPARG_KO_H02_GT23-11450_GCGCAAGC-CGGCGTGA_S40_L002
## 2 PPARG_KO_G02_GT23-11449_ATATGGAT-TAATACAG_S50_L002
## Sequence file read 1
## 1 PPARG_KO_H02_GT23-11450_GCGCAAGC-CGGCGTGA_S40_L002_R1_001.fastq.gz
## 2 PPARG_KO_G02_GT23-11449_ATATGGAT-TAATACAG_S50_L002_R1_001.fastq.gz
## Sequence file read 2
## 1 PPARG_KO_H02_GT23-11450_GCGCAAGC-CGGCGTGA_S40_L002_R2_001.fastq.gz
## 2 PPARG_KO_G02_GT23-11449_ATATGGAT-TAATACAG_S50_L002_R2_001.fastq.gz
## INPUT LIBRARY PREPARATION ID RUN ID
## 1 GT23-11450 20230918_23-robson-010
## 2 GT23-11449 20230918_23-robson-010
## LIBRARY AVERAGE FRAGMENT SIZE LIBRARY INPUT AMOUNT VALUE
## 1 406 300
## 2 402 300
## LIBRARY INPUT AMOUNT UNIT LIBRARY FINAL YIELD VALUE LIBRARY FINAL YIELD UNIT
## 1 ngs 168.8 ngs
## 2 ngs 199.6 ngs
## LIBRARY CONCENTRATION VALUE LIBRARY CONCENTRATION UNIT LIBRARY PCR CYCLES
## 1 47.2 nM 10
## 2 66.8 nM 10
## LIBRARY PCR CYCLES FOR SAMPLE INDEX DIFFERENTIATED CELL LINE ID
## 1 NA PrS-MOK20026W-H02
## 2 NA PrS-MOK20026W-G02
## DIFFERENTIATED CELL LINE DESCRIPTION
## 1 KOLF2.2 deleted PPARG by deletion of full coding region (KO) derived primitive syncytium
## 2 KOLF2.2 deleted PPARG by deletion of full coding region (KO) derived primitive syncytium
## TIMEPOINT TIME-POINT UNIT TERMINALLY DIFFERENTIATED?
## 1 6 days Yes
## 2 6 days Yes
## Model System INPUT CELL LINE ID CLONE ID
## 1 extra-embryonic primitive syncytial cells MOK20026W-H02 H02
## 2 extra-embryonic primitive syncytial cells MOK20026W-G02 G02
## CELL LINE ID CELL LINE TYPE DERIVED FROM CELL LINE NAME
## 1 MOK20026W-H02 iPSC KOLF2.2J
## 2 MOK20026W-G02 iPSC KOLF2.2J
## CELL LINE DESCRIPTION
## 1 KOLF2.2 deleted PPARG by deletion of full coding region (KO)
## 2 KOLF2.2 deleted PPARG by deletion of full coding region (KO)
## ALTERED GENE SYMBOLS ALTERATION TYPE TARGETED GENOMIC REGION
## 1 PPARG deletion full coding region (KO)
## 2 PPARG deletion full coding region (KO)
## TARGETED GENOMIC REGION COORDINATES GUIDE SEQUENCE
## 1 chr3:12379702-12417187 CAACCATGGTCATTTCTGAA;GATCTTCTATGAAAGAGGGT
## 2 chr3:12379702-12417187 CAACCATGGTCATTTCTGAA;GATCTTCTATGAAAGAGGGT
## ZYGOSITY GENE EXPRESSION ALTERATION PROTOCOL ID
## 1 N.A JAXPE001
## 2 N.A JAXPE001
## LIBRARY PREPARATION PROTOCOL ID SEQUENCING PROTOCOL ID
## 1 JAXPL001 JAXPS001
## 2 JAXPL001 JAXPS001
## DIFFERENTIATION PROTOCOL ID
## 1 JAXPD001
## 2 JAXPD001
## [1] "Dataset"
## [2] "Counts file"
## [3] "Sequence file read 1"
## [4] "Sequence file read 2"
## [5] "INPUT LIBRARY PREPARATION ID"
## [6] "RUN ID"
## [7] "LIBRARY AVERAGE FRAGMENT SIZE"
## [8] "LIBRARY INPUT AMOUNT VALUE"
## [9] "LIBRARY INPUT AMOUNT UNIT"
## [10] "LIBRARY FINAL YIELD VALUE"
## [11] "LIBRARY FINAL YIELD UNIT"
## [12] "LIBRARY CONCENTRATION VALUE"
## [13] "LIBRARY CONCENTRATION UNIT"
## [14] "LIBRARY PCR CYCLES"
## [15] "LIBRARY PCR CYCLES FOR SAMPLE INDEX"
## [16] "DIFFERENTIATED CELL LINE ID"
## [17] "DIFFERENTIATED CELL LINE DESCRIPTION"
## [18] "TIMEPOINT"
## [19] "TIME-POINT UNIT"
## [20] "TERMINALLY DIFFERENTIATED?"
## [21] "Model System"
## [22] "INPUT CELL LINE ID"
## [23] "CLONE ID"
## [24] "CELL LINE ID"
## [25] "CELL LINE TYPE"
## [26] "DERIVED FROM CELL LINE NAME"
## [27] "CELL LINE DESCRIPTION"
## [28] "ALTERED GENE SYMBOLS"
## [29] "ALTERATION TYPE"
## [30] "TARGETED GENOMIC REGION"
## [31] "TARGETED GENOMIC REGION COORDINATES"
## [32] "GUIDE SEQUENCE"
## [33] "ZYGOSITY"
## [34] "GENE EXPRESSION ALTERATION PROTOCOL ID"
## [35] "LIBRARY PREPARATION PROTOCOL ID"
## [36] "SEQUENCING PROTOCOL ID"
## [37] "DIFFERENTIATION PROTOCOL ID"
## [1] "PrS-MOK20026W-H02" "PrS-MOK20026W-G02" "PrS-MOK20026W-E02"
## [4] "PrS-MOK20026P-E12" "PrS-MOK20026P-D10"
cell_line_id = str_split(meta_flat$`DIFFERENTIATED CELL LINE ID`, pattern = '-')
table(sapply(cell_line_id, length))##
## 3 4
## 66 24
cell_line_id = lapply(cell_line_id, function(x) { length(x) <- 4; x})
cell_line_id = as.data.frame(do.call(rbind, cell_line_id))
dim(cell_line_id)## [1] 90 4
## model_system cell_line clone_id condition
## 1 PrS MOK20026W H02 <NA>
## 2 PrS MOK20026W G02 <NA>
cell_line_id$condition[which(is.na(cell_line_id$condition))] = "nor"
table(meta_flat$`Model System`, cell_line_id$model_system)##
## ExM PrS
## extra-embryonic mesenchymal cells 12 0
## extra-embryonic primitive syncytial cells 0 78
##
## hyp nor
## ExM 0 12
## PrS 12 66
run_id and ko_strategy.w2update = which(meta_flat$`RUN ID` == "20231004_23-robson-008-run2")
meta_flat$run_id = meta_flat$`RUN ID`
meta_flat$run_id[w2update] = "20230918_23-robson-008"
table(meta_flat$run_id, cell_line_id$model_system, useNA="ifany")##
## ExM PrS
## 20230809_23-robson-005-run2 0 30
## 20230918_23-robson-008 0 24
## 20230918_23-robson-009 0 12
## 20230918_23-robson-010 0 12
## 20231004_23-robson-011 12 0
##
## hyp nor
## 20230809_23-robson-005-run2 0 30
## 20230918_23-robson-008 12 12
## 20230918_23-robson-009 0 12
## 20230918_23-robson-010 0 12
## 20231004_23-robson-011 0 12
##
## critical exon (CE) full coding region (KO)
## 24 24
## premature termination codon (PTC) <NA>
## 24 18
meta_flat$ko_strategy = str_extract(meta_flat$`TARGETED GENOMIC REGION`,
"\\(\\S+\\)$")
meta_flat$ko_strategy = str_remove_all(meta_flat$ko_strategy, "\\(|\\)")
meta_flat$ko_strategy[which(is.na(meta_flat$ko_strategy))] = "WT"
table(meta_flat$`TARGETED GENOMIC REGION`, meta_flat$ko_strategy,
useNA="ifany")##
## CE KO PTC WT
## critical exon (CE) 24 0 0 0
## full coding region (KO) 0 24 0 0
## premature termination codon (PTC) 0 0 24 0
## <NA> 0 0 0 18
meta_flat$ko_gene = meta_flat$`ALTERED GENE SYMBOLS`
meta_flat$ko_gene[which(is.na(meta_flat$ko_gene))] = "WT"
table(cell_line_id$model_system, meta_flat$ko_gene)##
## EPAS1 FOSB GCM1 GRHL1 ISL1 POU2F3 PPARG WT
## ExM 0 0 0 0 9 0 0 3
## PrS 18 9 9 9 0 9 9 15
##
## EPAS1 FOSB GCM1 GRHL1 ISL1 POU2F3 PPARG WT
## 20230809_23-robson-005-run2 0 9 0 9 0 9 0 3
## 20230918_23-robson-008 18 0 0 0 0 0 0 6
## 20230918_23-robson-009 0 0 9 0 0 0 0 3
## 20230918_23-robson-010 0 0 0 0 0 0 9 3
## 20231004_23-robson-011 0 0 0 0 9 0 0 3
##
## EPAS1 FOSB GCM1 GRHL1 ISL1 POU2F3 PPARG WT
## CE 6 3 3 3 3 3 3 0
## KO 6 3 3 3 3 3 3 0
## PTC 6 3 3 3 3 3 3 0
## WT 0 0 0 0 0 0 0 18
meta = cbind(meta_flat, cell_line_id)
cts = fread(file.path(dat_path, "genesCounts.csv"), data.table = FALSE)
dim(cts)## [1] 38592 91
## Name POU2F3_KO_F04_GT23-10504_GGTACCTT-GACGTCTT_S33_L001
## 1 ENSG00000268674 0
## 2 ENSG00000271254 1030
## PTC_A09__Hypoxia_GT23-11309_AACGTTCC-AGTACTCC_S30_L001
## 1 0
## 2 489
## PTC_D10__Hypoxia_GT23-11310_GCAGAATT-TGGCCGGT_S22_L001
## 1 0
## 2 603
## [1] TRUE
##
## TRUE
## 90
model_s = meta$model_system
get_q75 <- function(x){quantile(x, probs = 0.75)}
gene_info = data.frame(Name = cts$Name,
t(apply(cts_dat, 1, tapply, model_s, min)),
t(apply(cts_dat, 1, tapply, model_s, median)),
t(apply(cts_dat, 1, tapply, model_s, get_q75)),
t(apply(cts_dat, 1, tapply, model_s, max)))
names(gene_info)[2:9] = paste0(rep(c("ExM_", "PrS_"), times=4),
rep(c("min", "median", "q75", "max"), each=2))
dim(gene_info)## [1] 38592 9
## Name ExM_min PrS_min ExM_median PrS_median ExM_q75
## ENSG00000268674 ENSG00000268674 0 0 0.0 0.0 0.00
## ENSG00000271254 ENSG00000271254 634 387 812.5 775.5 848.25
## PrS_q75 ExM_max PrS_max
## ENSG00000268674 0 0 3
## ENSG00000271254 923 948 1169
## ExM_min PrS_min ExM_median PrS_median
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 0.0 Median : 0.0 Median : 2.0 Median : 1.0
## Mean : 376.3 Mean : 255.1 Mean : 530.5 Mean : 585.1
## 3rd Qu.: 120.0 3rd Qu.: 60.0 3rd Qu.: 188.5 3rd Qu.: 182.5
## Max. :512761.0 Max. :154723.0 Max. :797550.5 Max. :379480.0
## ExM_q75 PrS_q75 ExM_max PrS_max
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 1
## Median : 3.0 Median : 3.0 Median : 6.0 Median : 10
## Mean : 608.8 Mean : 712.7 Mean : 751.6 Mean : 1102
## 3rd Qu.: 228.0 3rd Qu.: 233.8 3rd Qu.: 285.0 3rd Qu.: 405
## Max. :886970.0 Max. :456738.2 Max. :928104.0 Max. :801404
##
## FALSE TRUE
## FALSE 12757 959
## TRUE 2139 22737
## [1] 25835 90
gene_info = gene_info[w2kp,]
meta$read_depth = colSums(cts_dat)/1e6
meta$q75 = apply(cts_dat, 2, quantile, probs = 0.75)
ggplot(meta, aes(x=read_depth, y=q75, color=ko_strategy, shape = model_system)) +
geom_point(size=2, alpha=0.7) + ggtitle("Gene counts") +
scale_shape_manual(values = c(7, 16)) +
xlab("read depth (million)") + ylab("75 percentile of gene expression") +
labs(color = "KO type", shape = "Model") +
scale_color_brewer(palette = "Set1")meta$run_id_short = str_replace(meta$run_id, "^[^-]*-", "")
ggplot(meta, aes(x=read_depth, y=q75, color=run_id_short, shape = condition)) +
geom_point(size=2, alpha=0.7) + ggtitle("Gene counts") +
scale_shape_manual(values = c(7, 16)) +
xlab("read depth (million)") + ylab("75 percentile of gene expression") +
labs(color = "Run ID", shape = "Condition") +
scale_color_brewer(palette = "Set1")sample2kp = which(meta$ko_gene == "WT")
cts_dat_m = cts_dat[,sample2kp]
meta_m = meta[sample2kp,]
summary(meta_m$q75)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 361.0 422.2 469.8 478.4 540.5 592.0
stopifnot(all(meta_m$file_id == names(cts_dat_m)))
q75 = apply(cts_dat_m, 1, quantile, probs=0.75)
cts_dat_n = t(cts_dat_m[q75 > 0,])
cts_dat_n = log(median(meta_m$q75)*cts_dat_n/meta_m$q75 + 1)
dim(cts_dat_n)## [1] 18 23865
## [1] "sdev" "rotation" "center" "scale" "x"
## [1] 0.41336077 0.18367275 0.08208396 0.04567809 0.03608958
## [1] 41.3 18.4 8.2 4.6 3.6
PC_df = data.frame(pc_scores)
PC_df = cbind(PC_df, meta_m)
gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=condition, color=model_system)) +
geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
scale_shape_manual(values = c(7, 15, 16, 17)) +
labs(color="KO gene", shape ="KO strategy") +
xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) +
ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) +
ggtitle("Wild type samples") +
guides(
color = guide_legend(title = NULL, order = 1),
shape = guide_legend(title = NULL, order = 2)
) +
theme(
legend.margin = margin(0, 0, 0, 0),
legend.box.just = "left",
legend.direction = "vertical"
)
print(gPC)gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=condition, color=run_id_short)) +
geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
scale_shape_manual(values = c(7, 15, 16, 17)) +
labs(color="KO gene", shape ="KO strategy") +
xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) +
ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) +
ggtitle("Wild type samples") +
guides(
color = guide_legend(title = NULL, order = 1),
shape = guide_legend(title = NULL, order = 2)
) +
theme(
legend.margin = margin(0, 0, 0, 0),
legend.box.just = "left",
legend.direction = "vertical"
)
print(gPC)##
## ExM PrS
## 20230809_23-robson-005-run2 0 3
## 20230918_23-robson-008 0 6
## 20230918_23-robson-009 0 3
## 20230918_23-robson-010 0 3
## 20231004_23-robson-011 3 0
## Generate sample information matrix
cols2kp = c("model_system", "condition", "q75")
sample_info = meta_m[,cols2kp]
dim(sample_info)## [1] 18 3
## model_system condition q75
## 12 PrS nor 569.5
## 68 PrS hyp 545.0
sample_info$log_q75 = log(sample_info$q75)
dds = DESeqDataSetFromMatrix(cts_dat_m, colData=sample_info,
~ model_system + condition + log_q75)
dds = DESeq(dds)
## association with read-depth
res_rd = results(dds, name = "log_q75")
res_rd = as.data.frame(res_rd)
dim(res_rd)## [1] 25835 6
## baseMean log2FoldChange lfcSE stat pvalue
## ENSG00000271254 731.89279 0.0114876 0.2318513 0.04954726 0.9604832
## ENSG00000277196 46.50349 0.9817827 1.3388605 0.73329722 0.4633772
## padj
## ENSG00000271254 0.9997047
## ENSG00000277196 0.9997047
##
## FALSE TRUE
## 17751 8084
g0 = ggplot(res_rd %>% subset(!is.na(padj)), aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle("Read depth")
print(g0)## association with model_system
dds_lrt = DESeq(dds, test="LRT", reduced = ~ condition + log_q75)
res_lrt = results(dds_lrt)
res_model_system = as.data.frame(res_lrt)
dim(res_model_system)## [1] 25835 6
## baseMean log2FoldChange lfcSE stat pvalue
## ENSG00000271254 731.89279 0.0114876 0.2318513 4.06378556 0.04381218
## ENSG00000277196 46.50349 0.9817827 1.3388605 0.04552192 0.83104721
## padj
## ENSG00000271254 0.06908888
## ENSG00000277196 0.87097036
##
## FALSE TRUE
## 23736 2099
g0 = ggplot(res_model_system %>% subset(!is.na(padj)), aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle("Model system")
print(g0)## association with condition
dds_lrt = DESeq(dds, test="LRT", reduced = ~ model_system + log_q75)
res_lrt = results(dds_lrt)
res_condition = as.data.frame(res_lrt)
dim(res_condition)## [1] 25835 6
## baseMean log2FoldChange lfcSE stat pvalue
## ENSG00000271254 731.89279 0.0114876 0.2318513 0.2926949 0.58849873
## ENSG00000277196 46.50349 0.9817827 1.3388605 2.9420549 0.08630088
## padj
## ENSG00000271254 0.6952827
## ENSG00000277196 0.1494475
##
## FALSE TRUE
## 22739 3096
g0 = ggplot(res_condition %>% subset(!is.na(padj)), aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle("Condition")
print(g0)meta$model_condition = paste(meta$model_system, meta$condition, sep="_")
table(meta$model_condition)##
## ExM_nor PrS_hyp PrS_nor
## 12 12 66
##
## ExM_nor PrS_hyp PrS_nor
## 20230809_23-robson-005-run2 0 0 30
## 20230918_23-robson-008 0 12 12
## 20230918_23-robson-009 0 0 12
## 20230918_23-robson-010 0 0 12
## 20231004_23-robson-011 12 0 0
for(model1 in unique(meta$model_condition)){
print(model1)
sample2kp = which(meta$model_condition == model1)
cts_dat_m = cts_dat[,sample2kp]
meta_m = meta[sample2kp,]
table(meta_m$ko_strategy, substr(colnames(cts_dat_m), 1, 7))
stopifnot(all(meta_m$file_id == names(cts_dat_m)))
q75 = apply(cts_dat_m, 1, quantile, probs=0.75)
cts_dat_n = t(cts_dat_m[q75 > 0,])
cts_dat_n = log(median(meta_m$q75)*cts_dat_n/meta_m$q75 + 1)
dim(cts_dat_n)
sample_pca = prcomp(cts_dat_n, center = TRUE, scale. = TRUE)
names(sample_pca)
pc_scores = sample_pca$x
eigen_vals = sample_pca$sdev^2
eigen_vals[1:5]/sum(eigen_vals)
pvs = round(eigen_vals[1:5]/sum(eigen_vals)*100,1)
pvs
PC_df = data.frame(pc_scores)
PC_df = cbind(PC_df, meta_m)
gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=ko_strategy, color=ko_gene)) +
geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
scale_shape_manual(values = c(7, 15, 16, 17)) +
xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) +
ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) +
ggtitle(model1) +
guides(
color = guide_legend(title = NULL, order = 1),
shape = guide_legend(title = NULL, order = 2)
) +
theme(
legend.margin = margin(0, 0, 0, 0),
legend.box.just = "left",
legend.direction = "vertical"
)
print(gPC)
gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=run_id_short, color=ko_gene)) +
geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
scale_shape_manual(values = c(7, 15, 16, 17)) +
xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) +
ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) +
ggtitle(model1) +
guides(
color = guide_legend(title = NULL, order = 1),
shape = guide_legend(title = NULL, order = 2)
) +
theme(
legend.margin = margin(0, 0, 0, 0),
legend.box.just = "left",
legend.direction = "vertical"
)
print(gPC)
table(meta_m$run_id, meta_m$ko_gene)
## Generate sample information matrix
cols2kp = c("ko_strategy", "ko_gene", "run_id", "q75")
sample_info = meta_m[,cols2kp]
dim(sample_info)
sample_info[1:2,]
table(sample_info$ko_strategy, sample_info$ko_gene)
sample_info$ko_group = paste(sample_info$ko_gene, sample_info$ko_strategy, sep="_")
table(sample_info$ko_group)
sample_info$log_q75 = log(sample_info$q75)
if(model1 == "PrS_nor"){
dds = DESeqDataSetFromMatrix(cts_dat_m, colData=sample_info,
~ ko_group + run_id + log_q75)
}else{
dds = DESeqDataSetFromMatrix(cts_dat_m, colData=sample_info,
~ ko_group + log_q75)
}
dds = DESeq(dds)
## association with read-depth
res_rd = results(dds, name = "log_q75")
res_rd = as.data.frame(res_rd)
dim(res_rd)
res_rd[1:2,]
table(is.na(res_rd$padj))
g0 = ggplot(res_rd %>% subset(!is.na(padj)), aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle(paste0(model1, " read depth"))
print(g0)
## association with run_id
if(model1 == "PrS_nor"){
dds_lrt = DESeq(dds, test="LRT", reduced = ~ ko_group + log_q75)
res_lrt = results(dds_lrt)
res_run_id = as.data.frame(res_lrt)
dim(res_run_id)
res_run_id[1:2,]
table(is.na(res_run_id$padj))
g0 = ggplot(res_run_id %>% subset(!is.na(padj)), aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle(paste0(model1, " Run ID"))
print(g0)
}
## DE analysis for each KO gene
table(sample_info$ko_gene)
genos = setdiff(unique(sample_info$ko_gene), "WT")
genos
ko_grp = c("CE", "KO", "PTC")
res_geno_df = NULL
for(geno in genos){
res_geno = list()
for(k_grp1 in ko_grp){
res_g1 = results(dds, contrast = c("ko_group",
paste0(geno, "_", k_grp1), "WT_WT"))
res_g1 = signif(data.frame(res_g1), 3)
res_geno[[k_grp1]] = res_g1
g1 = ggplot(res_g1 %>% subset(!is.na(padj)), aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle(paste0(geno, "_", k_grp1))
print(g1)
tag1 = sprintf("%s_%s_%s_", model1, geno, k_grp1)
colnames(res_g1) = paste0(tag1, colnames(res_g1))
if(is.null(res_geno_df)){
res_geno_df = res_g1
}else if(!is.null(res_geno_df)){
stopifnot(all(rownames(res_geno_df) == rownames(res_g1)))
res_geno_df = cbind(res_geno_df, res_g1)
}
}
get_index <- function(x){
which(x$padj < 0.05 & abs(x$log2FoldChange) > log2(1.5))
}
w_ce = get_index(res_geno[["CE"]])
w_ko = get_index(res_geno[["KO"]])
w_ptc = get_index(res_geno[["PTC"]])
m = make_comb_mat(list("CE" = rownames(res_geno[["CE"]])[w_ce],
"KO" = rownames(res_geno[["KO"]])[w_ko],
"PTC" = rownames(res_geno[["PTC"]])[w_ptc]))
g_up = UpSet(m)
print(g_up)
df1 = data.frame(padj_CE = res_geno[["CE"]]$padj,
padj_KO = res_geno[["KO"]]$padj,
padj_PTC = res_geno[["PTC"]]$padj)
cr1 = cor(df1$padj_PTC, df1$padj_CE, method = "spearman", use="pair")
cr2 = cor(df1$padj_PTC, df1$padj_KO, method = "spearman", use="pair")
g4 = ggplot(data = df1, mapping = aes(x = -log10(padj_PTC),
y = -log10(padj_CE))) +
geom_pointdensity() + ggtitle(sprintf("%s, Spearman r = %.2f", geno, cr1)) +
scale_color_viridis()
g5 = ggplot(data = df1, mapping = aes(x = -log10(padj_PTC),
y = -log10(padj_KO))) +
geom_pointdensity() + ggtitle(sprintf("%s, Spearman r = %.2f", geno, cr2)) +
scale_color_viridis()
print(g4)
print(g5)
}
dim(res_geno_df)
res_geno_df[1:2,]
res_df = data.frame(gene_ID=rownames(res_geno_df), res_geno_df)
dim(res_df)
res_df[1:2,]
fwrite(res_df, sprintf("results/2023_12_%s_DEseq2.tsv", model1), sep="\t")
}## [1] "PrS_nor"
## [1] "PrS_hyp"
## [1] "ExM_nor"
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 7735598 413.2 15844465 846.2 NA 15844465 846.2
## Vcells 29440771 224.7 96475633 736.1 65536 96475633 736.1
## R version 4.2.3 (2023-03-15)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.4.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] readxl_1.4.2 UpSetR_1.4.0
## [3] ComplexHeatmap_2.14.0 data.table_1.14.8
## [5] dplyr_1.1.2 ggpointdensity_0.1.0
## [7] viridis_0.6.2 viridisLite_0.4.1
## [9] DESeq2_1.38.3 SummarizedExperiment_1.28.0
## [11] Biobase_2.58.0 MatrixGenerics_1.10.0
## [13] matrixStats_1.0.0 GenomicRanges_1.50.2
## [15] GenomeInfoDb_1.34.9 IRanges_2.32.0
## [17] S4Vectors_0.36.2 BiocGenerics_0.44.0
## [19] RColorBrewer_1.1-3 MASS_7.3-58.2
## [21] stringr_1.5.0 ggpubr_0.6.0
## [23] ggrepel_0.9.3 ggplot2_3.4.2
##
## loaded via a namespace (and not attached):
## [1] colorspace_2.1-0 ggsignif_0.6.4 rjson_0.2.21
## [4] circlize_0.4.15 XVector_0.38.0 GlobalOptions_0.1.2
## [7] clue_0.3-65 rstudioapi_0.14 farver_2.1.1
## [10] bit64_4.0.5 AnnotationDbi_1.60.2 fansi_1.0.4
## [13] codetools_0.2-19 doParallel_1.0.17 cachem_1.0.7
## [16] geneplotter_1.76.0 knitr_1.44 jsonlite_1.8.4
## [19] broom_1.0.4 annotate_1.76.0 cluster_2.1.4
## [22] png_0.1-8 compiler_4.2.3 httr_1.4.6
## [25] backports_1.4.1 Matrix_1.6-4 fastmap_1.1.1
## [28] cli_3.6.1 htmltools_0.5.5 tools_4.2.3
## [31] gtable_0.3.3 glue_1.6.2 GenomeInfoDbData_1.2.9
## [34] Rcpp_1.0.10 carData_3.0-5 cellranger_1.1.0
## [37] jquerylib_0.1.4 vctrs_0.6.2 Biostrings_2.66.0
## [40] iterators_1.0.14 xfun_0.39 lifecycle_1.0.3
## [43] rstatix_0.7.2 XML_3.99-0.14 zlibbioc_1.44.0
## [46] scales_1.2.1 parallel_4.2.3 yaml_2.3.7
## [49] memoise_2.0.1 gridExtra_2.3 sass_0.4.5
## [52] stringi_1.7.12 RSQLite_2.3.1 foreach_1.5.2
## [55] BiocParallel_1.32.6 shape_1.4.6 rlang_1.1.0
## [58] pkgconfig_2.0.3 bitops_1.0-7 evaluate_0.20
## [61] lattice_0.20-45 purrr_1.0.1 labeling_0.4.2
## [64] bit_4.0.5 tidyselect_1.2.0 plyr_1.8.8
## [67] magrittr_2.0.3 R6_2.5.1 generics_0.1.3
## [70] DelayedArray_0.24.0 DBI_1.1.3 pillar_1.9.0
## [73] withr_2.5.0 KEGGREST_1.38.0 abind_1.4-5
## [76] RCurl_1.98-1.12 tibble_3.2.1 crayon_1.5.2
## [79] car_3.1-2 utf8_1.2.3 rmarkdown_2.21
## [82] GetoptLong_1.0.5 locfit_1.5-9.8 blob_1.2.4
## [85] digest_0.6.31 xtable_1.8-4 tidyr_1.3.0
## [88] munsell_0.5.0 bslib_0.4.2